The goal is to show that Word2Vec (via gensim: https://radimrehurek.com/gensim/) can be used to score sentences against a target term, so that the sentences most similar to the target can be ranked first. The model is Google's pre-trained word vectors (https://code.google.com/archive/p/word2vec/): 300-dimensional vectors trained on a Google News corpus of approximately 100 billion words. Note that this model is trained on news data, which doesn't necessarily map well to the brands domain shown below, but it is good enough for this proof of concept.
I've also drawn a T-SNE visualisation of word similarity for a selection of approximately 100 words. This might be a useful diagnostic.
Finally, I show an old plot of similarity-stability when training on different vocabulary sizes; this is another diagnostic of how well word2vec has learned the underlying relationships (high stability suggests it has learned them to a higher fidelity). This becomes relevant when we start to train on our own datasets to replace the Google pre-built model with something suited to a particular domain (the stability example comes from 500,000 documents of recruitment data).
This is related to my Data Science Delivered collection of notes.
License: CC-BY (Attribution)
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pprint
from gensim.models import word2vec
%matplotlib inline
In [2]:
if False:
    # start an interactive console
    %qtconsole style=monokai
In [3]:
# %install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
# show a watermark for this environment
%watermark -d -m -v -p numpy,matplotlib -g
In [4]:
# load w2v from the pre-built Google News data
# download GoogleNews-vectors-negative300.bin.gz from: https://code.google.com/archive/p/word2vec/
# (note: in more recent gensim releases this loader lives on gensim.models.KeyedVectors)
w2v = word2vec.Word2Vec.load_word2vec_format("/home/ian/data/word2vec/GoogleNews-vectors-negative300.bin", binary=True)
w2v_vocab = set(w2v.vocab)
print("Loaded {} words in vocabulary".format(len(w2v_vocab)))
In [5]:
# example of a similarity query; the result is a cosine similarity in the range [-1..1] (usually positive for related terms)
w2v.similarity("Pepsi", "Coke")
Out[5]:
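Words that are missing from the pre-trained vocabulary raise a KeyError, so it can help to guard the lookup. A minimal sketch, assuming we fall back to a default score for out-of-vocabulary words (safe_similarity is my own helper, not part of gensim):
In [ ]:
# guard against out-of-vocabulary words (safe_similarity is a hypothetical helper, not a gensim call)
def safe_similarity(word1, word2, model=w2v, default=0.0):
    if word1 in w2v_vocab and word2 in w2v_vocab:
        return model.similarity(word1, word2)
    return default

safe_similarity("Pepsi", "Coke"), safe_similarity("Pepsi", "notaword_xyz")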
In [6]:
words = ["Coca_Cola", "Pepsi", "pepsi", "cola", "Microsoft", "Samsung", "Apple", "Google"]
similarities = np.zeros((len(words), len(words)), dtype=np.float_)
for idx1, word1 in enumerate(words):
    for idx2, word2 in enumerate(words):
        # note: a KeyError is possible if a word isn't in the vocabulary
        sim = w2v.similarity(word1, word2)
        similarities[idx1, idx2] = sim
df = pd.DataFrame.from_records(similarities, columns=words)
df.index = words
In [7]:
f, ax=plt.subplots(1, 1, figsize=(14,8))
cmap = plt.cm.Blues
mask = np.zeros_like(df)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(df, cmap=cmap, mask=mask, square=True, ax=ax)
_=plt.yticks(rotation=90)
plt.xlabel('Words')
_=plt.xticks(rotation=45)
_=plt.title("Similarities between words")
# we can see that the Pepsi/Coke terms are similar to one another and the tech brands are similar to one another, but the two groups are not similar to each other
In [8]:
# example sentences (pretend these are from subtitles)
sentences = ["I'm happy to shop in Walmart and buy a Google phone",
             "In today's demo we'll look at Office and Word from microsoft",
             "Tech companies like Apple with their iPhone are the new cool",
             "Yesterday I went swimming",
             "Pepsi is drunk by a New Generation",
             "Bob has an Android Nexus 5 for his telephone",  # Android and Google are 0.56 similar in this model
             "Alice drinks coffee every morning",
             "I want to drink a coke and eat something",
             "You'll be happier if you take a swim",
             "This is a really long sentence that hopefully doesn't get a very high score just because it has lots of words in it!"]
# here's a target we'd like to score against
#target_sentence = "You'd love to drink a cool refreshing Coke"
target_sentence = "Microsoft smartphones are the latest buzz"
In [9]:
# use n_similarity to compute a cosine similarity (should be reasonably robust)
sentences_similarity = np.zeros(len(sentences))
target_sentence_words = [w for w in target_sentence.split() if w in w2v_vocab]
for idx, sentence in enumerate(sentences):
    sentence_words = [w for w in sentence.split() if w in w2v_vocab]
    sim = w2v.n_similarity(target_sentence_words, sentence_words)
    sentences_similarity[idx] = sim
result = list(zip(sentences_similarity, sentences))
result.sort(key=lambda item:item[0], reverse=True)
print("Target:", target_sentence)
pprint.pprint(result)
# show the target phrase and the candidate sentences, most similar first
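For reference, n_similarity is roughly the cosine between the averaged word vectors of the two word lists. A minimal numpy sketch of that idea (mean_vector_cosine is my own illustrative helper, not part of gensim):
In [ ]:
# rough sketch of what n_similarity computes: the cosine between the mean word vectors
# (mean_vector_cosine is a hypothetical helper, not a gensim call)
def mean_vector_cosine(words_a, words_b, model=w2v):
    v_a = np.mean([model[w] for w in words_a], axis=0)
    v_b = np.mean([model[w] for w in words_b], axis=0)
    return np.dot(v_a, v_b) / (np.linalg.norm(v_a) * np.linalg.norm(v_b))

# should closely match w2v.n_similarity for the same word lists
mean_vector_cosine(target_sentence_words,
                   [w for w in sentences[0].split() if w in w2v_vocab])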
In [10]:
# try a naive mean-of-word-similarities method; this is far less robust
sentences_similarity = np.zeros(len(sentences))
for idx, sentence in enumerate(sentences):
    sentence_words = sentence.split()
    sim_to_sentence = 0
    for word in sentence_words:
        for target in target_sentence.split():
            try:
                sim_to_sentence += w2v.similarity(word, target)
            except KeyError:
                pass  # ignore words that aren't in the vocabulary
    sim_to_sentence /= len(sentence_words)
    sentences_similarity[idx] += sim_to_sentence
result = list(zip(sentences_similarity, sentences))
result.sort(key=lambda item:item[0], reverse=True)
print("Target:", target_sentence)
pprint.pprint(result)
In [11]:
print(w2v['King'].shape) # 300 dimension vector for 1 word
In [17]:
# classic king-man+woman=queen demo
w2v.most_similar(positive=["king", "woman"], negative=['man'])
Out[17]:
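The same analogy can be checked by hand on the raw vectors: subtract 'man' from 'king', add 'woman', and compare the result to 'queen' with a cosine. A minimal sketch (the cosine helper below is mine, not a gensim function):
In [ ]:
# check the analogy by hand with the raw vectors (this cosine helper is not part of gensim)
def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

analogy_vector = w2v['king'] - w2v['man'] + w2v['woman']
cosine(analogy_vector, w2v['queen'])  # expect a relatively high similarity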
In [13]:
# we can ask for very similar terms
w2v.most_similar(positive=["Coca_Cola"], negative=[])
Out[13]:
In [14]:
# similar to Coke and to beverage, but not to soft_drink, gives...
w2v.most_similar(positive=["Coke", "beverage"], negative=['soft_drink'])
# Diageo (UK spirits producer inc. Baileys, Moët Hennessy)
# Glacéau (enhanced water/energy drinks)
Out[14]:
In [15]:
# a drink like coke but non-alcoholic
w2v.most_similar(positive=["Coke", "drink"], negative=['alcohol'])
Out[15]:
In [16]:
from sklearn.manifold import TSNE
raw_words_of_interest = ['Coke', 'Pepsi', 'cola', 'drink',
                         'cool', 'swim', 'swimming', 'thirst',
                         'Microsoft', 'Oracle',
                         'smartphone', 'cruise']
# some other random stuff we could throw in...
# 'King', 'Queen', 'person', 'walking', 'dancing', 'news', 'food', 'kitchen', 'house']
words_of_interest = []
for woi in raw_words_of_interest:
    for word, _ in w2v.most_similar(woi):
        words_of_interest.append(word)
words_of_interest = list(set(words_of_interest))
vectors = []
for word in words_of_interest:  # iterate the deduplicated list so the order matches the labels below
    vectors.append(w2v[word])
vectors = np.vstack(vectors)  # turn vectors into a 2D array <words x 300dim>
model = TSNE(n_components=2, random_state=0)
X_tsne = model.fit_transform(vectors)
df_after_tsne = pd.DataFrame.from_records(X_tsne, columns=['x', 'y'])
df_after_tsne['labels'] = words_of_interest
# calculate similarity from a target word to all words, to use as our colour
target_word = "smartphone"
similarities = []
for woi in words_of_interest:
    similarity = min(max(0, w2v.similarity(target_word, woi)), 1.0)
    similarities.append(similarity)
# plot the T-SNE layout for the words; darker words are more similar to our target
plt.figure(figsize=(12,8))
plt.xlim((min(X_tsne[:,0]), max(X_tsne[:,0])))
plt.ylim((min(X_tsne[:,1]), max(X_tsne[:,1])))
for idx in range(X_tsne.shape[0]):
    x, y = X_tsne[idx]
    label = words_of_interest[idx]
    color = str(min(0.6, 1.0 - similarities[idx]))  # convert to a greyscale string "0.0".."1.0" for matplotlib
    plt.annotate(s=label, xy=(x, y), color=color)
    #plt.annotate(s=label, xy=(x, y), weight=int(similarities[idx]*1000))  # alternative: use font weight
plt.tight_layout()
_=plt.title("Word similarity (T-SNE) using vectors from {} words\nColoured by similarity to '{}'".format(len(words_of_interest), target_word))
This example shows how adding more sentence examples (i.e. Moar Data!) across 5 model rebuilds gives greater stability (more consistent similarity scores) (original tweet):
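A minimal sketch of that stability check, under stated assumptions: corpus_sentences stands in for your own tokenised domain corpus and the word pairs are illustrative placeholders (this is not the recruitment dataset used for the original plot):
In [ ]:
# rough sketch of the stability diagnostic (corpus_sentences and the word pairs are placeholders)
word_pairs = [("Pepsi", "Coke"), ("Microsoft", "Google")]
for corpus_size in [10000, 50000, 100000, 500000]:
    pair_scores = {pair: [] for pair in word_pairs}
    for seed in range(5):  # 5 model rebuilds per corpus size
        model = word2vec.Word2Vec(corpus_sentences[:corpus_size], size=300,
                                  min_count=5, seed=seed)
        for w1, w2 in word_pairs:
            if w1 in model.vocab and w2 in model.vocab:
                pair_scores[(w1, w2)].append(model.similarity(w1, w2))
    for pair, scores in pair_scores.items():
        # a smaller spread across rebuilds suggests the relationships are learned more stably
        print(corpus_size, pair, np.std(scores))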